In [158]:
#### Synthea Health: Data Analysis and Insights Generation
In [ ]:
!pip install -r requirements.txt
In [18]:
#import the required funcions and packages for EDA

import pandas as pd
from utils.utils import read_data_parse_dates,get_demographic_plot
import json
import plotly.express as px

Import the required required cleaned and transformed data and also required functions

In [20]:
config_file_path = './config/config.json'

# Load the JSON file
with open(config_file_path, 'r') as file:
    config = json.load(file)

# Access the value of 'input_folder'
output_folder = config.get('cleaned_data_path')
print("output Folder:", output_folder)
output Folder: C://Users//veena.vemula//Documents//GitHub//SDE-Skills-Test//Output
In [22]:
df_phr =  read_data_parse_dates(output_folder,"diagnosis_data",["BIRTHDATE"])
df_patients =  read_data_parse_dates(output_folder,"patients_cleaned",["BIRTHDATE"])
df_encounters =  read_data_parse_dates(output_folder,"encounters_cleaned",["START_ENC","STOP_ENC"])
df_conditions =  read_data_parse_dates(output_folder,"conditions_cleaned",["START_DIAG","STOP_DIAG"])
df_medications =  read_data_parse_dates(output_folder,"medications_cleaned",["START_MED","STOP_MED"])
df_procedures =  read_data_parse_dates(output_folder,"procedures_cleaned",["START_PROC","STOP_PROC"])

Read all the input files from the output folder

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
In [24]:
plt.figure(figsize=(10, 6))
sns.histplot(df_phr['age_diag'], bins=10, kde=True, color='blue', alpha=0.7)
plt.title('Histogram of Patient Age Distribution', fontsize=16)
plt.xlabel('Age', fontsize=14)
plt.ylabel('Number of Patients', fontsize=14)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.tight_layout()

# Show the plot
plt.show()
No description has been provided for this image

Age distribution shows normal distribution with a most of the patients between age group 20 and 80. In the data there are patients above 100 age group - average life expectancy is 80 and there are more female patients compared to male in above 100 age group. Female are more prone to diseases than males but males life expectancy is shorter than females

In [27]:
patient_counts_by_diagnosis = df_phr.groupby('DESCRIPTION')['PATIENT_ID'].nunique()
patient_counts_sorted = patient_counts_by_diagnosis.sort_values(ascending=False)

print(patient_counts_sorted.head(30))
DESCRIPTION
Stress (finding)                                    894
Viral sinusitis (disorder)                          739
Limited social contact (finding)                    661
Social isolation (finding)                          653
Acute viral pharyngitis (disorder)                  506
Not in labor force (finding)                        501
Victim of intimate partner abuse (finding)          466
Body mass index 30+ - obesity (finding)             463
Acute bronchitis (disorder)                         454
Prediabetes                                         341
Anemia (disorder)                                   324
Hypertension                                        292
Risk activity involvement (finding)                 289
Normal pregnancy                                    217
Chronic sinusitis (disorder)                        216
Has a criminal record (finding)                     206
Miscarriage in first trimester                      204
Chronic low back pain (finding)                     195
Otitis media                                        160
Unhealthy alcohol drinking behavior (finding)       154
Housing unsatisfactory (finding)                    154
Streptococcal sore throat (disorder)                146
Hyperlipidemia                                      138
Only received primary school education (finding)    123
Chronic neck pain (finding)                         123
Sprain of ankle                                     110
Transport problems (finding)                        107
Severe anxiety (panic) (finding                     107
Lack of access to transportation (finding)          104
Suspected COVID-19                                   96
Name: PATIENT_ID, dtype: int64
In [30]:
top_15_diagnoses = df_phr.groupby('DESCRIPTION').size().sort_values(ascending = False).head(15)
# Replace diagnoses not in top 15 with "Others"
GroupedDiagnosis = df_phr['DESCRIPTION'].apply(
    lambda x: x if x in top_15_diagnoses else 'Others'
)
GroupedDiagnosis.value_counts()
Out[30]:
DESCRIPTION
Others                                        5994
Stress (finding)                              5137
Social isolation (finding)                    1243
Viral sinusitis (disorder)                    1233
Limited social contact (finding)              1200
Not in labor force (finding)                  1077
Victim of intimate partner abuse (finding)     819
Acute viral pharyngitis (disorder)             678
Acute bronchitis (disorder)                    571
Normal pregnancy                               527
Body mass index 30+ - obesity (finding)        463
Risk activity involvement (finding)            392
Prediabetes                                    341
Anemia (disorder)                              324
Hypertension                                   292
Chronic sinusitis (disorder)                   219
Name: count, dtype: int64
In [32]:
top_15_diagnoses = GroupedDiagnosis.value_counts().head(30)

plt.figure(figsize=(10, 6))
sns.barplot(x=top_15_diagnoses.values, y=top_15_diagnoses.index, palette='viridis')

plt.title('Top 15 Most Frequent Diagnoses', fontsize=16)
plt.xlabel('Number of Patients', fontsize=14)
plt.ylabel('Diagnosis Description', fontsize=14)

plt.grid(axis='x', linestyle='--', alpha=0.7)
plt.tight_layout()

plt.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\3395157393.py:4: FutureWarning: 

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=top_15_diagnoses.values, y=top_15_diagnoses.index, palette='viridis')
No description has been provided for this image

Above graph shows Stress , Social isolatio, viral sinisitus and Social contact are the top diagnosis with higher number of patients

In [37]:
# Filter procedures for the last 10 years
current_year = df_procedures['Year'].max()
last_10_years = current_year - 9
df_procedures_last_10_years = df_procedures[df_procedures['Year'] >= last_10_years]

# Shorten procedure descriptions for clarity
df_procedures_last_10_years["DESCRIPTION_SHORT"] = df_procedures_last_10_years['DESCRIPTION'].str.slice(0, 35)

# Group by year and procedure description
procedure_trends_last_10_years = (
    df_procedures_last_10_years.groupby(['Year', 'DESCRIPTION_SHORT'])
    .size()
    .reset_index(name='Count')
)

# Create the line chart
fig = px.line(
    procedure_trends_last_10_years,
    x='Year',
    y='Count',
    color='DESCRIPTION_SHORT',
    line_group='DESCRIPTION_SHORT',
    title='Trends in Medical Procedures Over the Last 10 Years',
    labels={'Year': 'Year', 'DESCRIPTION_SHORT': 'Procedure', 'Count': 'Frequency'}
)

# Save and show the visualization
fig.write_html("./output/data_visualization/trend_procedure_last_10_years.html")
fig.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\467863593.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_procedures_last_10_years["DESCRIPTION_SHORT"] = df_procedures_last_10_years['DESCRIPTION'].str.slice(0, 35)
In [39]:
## scatter plot
df_proc_pat = pd.merge(df_procedures,df_patients,on ="PATIENT_ID",how="left")
df_proc_pat["Procedure_age"] = df_proc_pat["START_PROC"].dt.year-df_proc_pat["BIRTHDATE"].dt.year
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')]  # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_proc_pat['age_proc_group'] = pd.cut(df_proc_pat['Procedure_age'], bins=bins, labels=labels, right=False)
df_group_proc_age = df_proc_pat[["age_proc_group","DESCRIPTION"]].groupby(["age_proc_group","DESCRIPTION"]).size().reset_index().rename({0:"frequency"},axis=1)
df_group_proc_age[df_group_proc_age["frequency"]!=0]

fig = px.scatter(
    df_group_proc_age,
    x='age_proc_group',               # X-axis: Age groups
    y='frequency',               # Y-axis: Frequency of procedures
    color='DESCRIPTION',           # Color by procedure type
    size='frequency',            # Bubble size (optional)
    title='Frequency of Procedures by Age Group',
    labels={'frequency': 'Frequency', 'age_group': 'Age Group'}
)


fig.write_html("./output/data_visualization/scatter_diag_freq.html")
fig.to_image(format = "png")
fig.show()
## social care needs were more in people over 60 age group
## immunotherapy was more in young people 10 to 20
## assessment of substance used procedure is seen across all group
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\501261827.py:7: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Above graph gives many insights like Assesment of healtha nd social care needs are higher in age groups 20 to 70. Depression screening is also higher fro age groups 20 to 80. Orthopaedics procedure is higher between agegroups 40 to 60

In [171]:
## number of visit by time
#from encounter data if i see multiple entries for patientid 1 shall take all or count for this patient is 1
In [42]:
# Combine year and month into a datetime column
df_encounters['date'] = pd.to_datetime(df_encounters[['Year', 'Month']].assign(day=1))  # Assume day=1

# Count unique patient IDs over time
patients_over_time = df_encounters.groupby(['Year', 'Month']).agg(num_patients=('PATIENT_ID', 'nunique')).reset_index()

# Create a slider
fig = px.bar(
    patients_over_time,
    x='Month',
    y='num_patients',
    animation_frame='Year',  # Enables the slider for years
    title='Number of Patients Over Time by Year',
    labels={'month': 'Month', 'num_patients': 'Number of Patients'},
    range_y=[0, patients_over_time['num_patients'].max() + 5]  # Adjust Y-axis range for clarity
)

# Customize layout
fig.update_layout(
    xaxis=dict(tickmode='array', tickvals=list(range(1, 13)), ticktext=['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']),
    transition={'duration': 500}  # Smooth transition between frames
)

fig.write_html("./output/data_visualization/slider_patients_visit_overtime.html")
fig.to_image(format = "png")
fig.show()
## spike of patient visist seen in 2021 covid years

Number of patients started to rise from 1970 - this may be becuase of lack of data. There is an hike in number of patients in 2020 may be because of Covid.There is an sudden increase in number of patients in 2014.

In [53]:
def get_demographic_top20_plot(df,x,y,title):
    temp_data = df.groupby([x,y]).size().reset_index(name='Count')
    fig = px.bar(
    temp_data,
    x='DESCRIPTION',
    y='Count',
    color=x,
    barmode='group',
    title=title,
    labels={'diagnosis_id': 'Diagnosis ID', 'Count': 'Frequency'}
)
    return fig
In [55]:
df_pat_cond["age_diag"] = df_pat_cond["START_DIAG"].dt.year-df_pat_cond["BIRTHDATE"].dt.year
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')]  # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_pat_cond['age_diag_group'] = pd.cut(df_pat_cond['age_diag'], bins=bins, labels=labels, right=False)
In [57]:
demo_diag_data = df_pat_cond[["PATIENT_ID","age_diag_group","STATE","MARITAL","RACE","ETHNICITY","GENDER","DESCRIPTION"]].drop_duplicates()
In [93]:
## dist of diag by age on top 50 diag
top20_diag = list(demo_diag_data["DESCRIPTION"].value_counts().reset_index().sort_values(by="DESCRIPTION",ascending=False).head(50)["DESCRIPTION"].values)
demo_diag_data_top20 = demo_diag_data[demo_diag_data["DESCRIPTION"].isin(top20_diag)]
fig = get_demographic_top20_plot(demo_diag_data_top20,"age_diag_group","DESCRIPTION","Distribution of top 20 diag by age_group")
fig.to_image(format = "png")
fig.show()
C:\Users\veena.vemula\AppData\Local\Temp\ipykernel_9512\2424614841.py:2: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

chronic sinustisis seen more in young people comparatively¶

unhealthy alcohal drinking seen more in young peopel¶

In [91]:
## dist of diag by race on top diag
fig = get_demographic_top20_plot(demo_diag_data_top20,"RACE","DESCRIPTION","Distribution of top 20 diag by RACE")
fig.to_image(format = "png")
fig.show()
In [65]:
df_pat_med = pd.merge(df_medications, df_patients, on='PATIENT_ID', how='inner')
df_pat_med["age_med"] = df_pat_med["START_MED"].dt.year-df_pat_med["BIRTHDATE"].dt.year ## calcualte age at the time of medicine given
bins = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, float('inf')]  # Add infinity for 90+
labels = [f"{bins[i]}-{bins[i + 1] - 1}" for i in range(len(bins) - 2)] + [f"{bins[-2]}+"]
df_pat_med['age_med_group'] = pd.cut(df_pat_med['age_med'], bins=bins, labels=labels, right=False)
In [ ]:
demo_med_data = df_pat_med[["PATIENT_ID","age_group","STATE","MARITAL","RACE","ETHNICITY","GENDER","DESCRIPTION"]].drop_duplicates()
demo_med_data## removing duplicates on above combination
In [89]:
## dist of medication by age group on top 50 =medication
top20_diag = list(demo_med_data["DESCRIPTION"].value_counts().reset_index().sort_values(by="DESCRIPTION",ascending=False).head(50)["DESCRIPTION"].values)
demo_med_data_top20 = demo_med_data[demo_med_data["DESCRIPTION"].isin(top20_diag)]
fig = get_demographic_top20_plot(demo_med_data_top20,"age_group","DESCRIPTION","Distribution of top 20 medication by Medication")
fig.to_image(format = "png")
fig.show()

Amoxcinlin Acetaminophen are given more to aged people 30+¶

seasonique 91 pack is given more to young people¶

In [87]:
## by RACE
fig = get_demographic_top20_plot(demo_med_data_top20,"RACE","DESCRIPTION","Distribution of top 20 medication by RACE")
fig.to_image(format = "png")
fig.show()
In [73]:
# •	Calculate mean, median, and standard deviation for numerical columns (e.g., age, frequency of visits).
In [85]:
aga_diag_mean = df_pat_cond["age_diag"].mean()
aga_diag_median = df_pat_cond["age_diag"].median()
aga_diag_std = df_pat_cond["age_diag"].std()
fig = px.bar( x=["Age_mean","Age_median","Age_std"],y=[aga_diag_mean,aga_diag_median,aga_diag_std], title="Mean Median Std of Age",
              color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
In [83]:
#for frequency of visit
freq_visit_mean = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].mean()
freq_visit_median = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].median()
freq_visit_std = df_encounters["PATIENT_ID"].value_counts().reset_index()["count"].std()
fig = px.bar( x=["freq_visit_mean","freq_visit__median","freq_visit_std"],y=[freq_visit_mean,freq_visit_median,freq_visit_std], title="Mean Median Std of frequency of visit",
              color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
 
In [81]:
# •	Calculate mean, median, and standard deviation for numerical columns  frequency of visits).
aga_diag_mean = df_pat_cond["age_diag"].mean()
aga_diag_median = df_pat_cond["age_diag"].median()
aga_diag_std = df_pat_cond["age_diag"].std()
fig = px.bar( x=["Age_mean","Age_median","Age_std"],y=[aga_diag_mean,aga_diag_median,aga_diag_std], title="Mean Median Std of Age",
              color_continuous_scale='Viridis')
#fig.write_html("./output/data_visualization/top_patient_by_visit.html")
fig.to_image(format = "png")
fig.show()
#

Mean and median are almost same with small std deviation implies the data spread for Age and frequency of visit follows normal distribution

In [ ]: